{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5fd9842d",
   "metadata": {},
   "source": [
    "# How to compute DF (document frequency) counts using `pke`\n",
    "\n",
    "DF counts are required by several keyphrase extraction models (e.g. TfIdf, Kea) for weighting candidates. Below is an example on how to compute DF counts for a given text collection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a90172a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting git+https://github.com/boudinfl/pke.git\n",
      "  Cloning https://github.com/boudinfl/pke.git to /private/var/folders/_s/dsym612j14gggkqchsd35clh0000gn/T/pip-req-build-m0yraixf\n",
      "  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /private/var/folders/_s/dsym612j14gggkqchsd35clh0000gn/T/pip-req-build-m0yraixf\n",
      "  Resolved https://github.com/boudinfl/pke.git to commit dec50293ffd25f63a554e5822af13608686bfc77\n",
      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25hRequirement already satisfied: nltk in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (3.7)\n",
      "Requirement already satisfied: networkx in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (2.7.1)\n",
      "Requirement already satisfied: numpy in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.22.3)\n",
      "Requirement already satisfied: scipy in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.8.0)\n",
      "Requirement already satisfied: sklearn in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (0.0)\n",
      "Requirement already satisfied: unidecode in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.3.4)\n",
      "Requirement already satisfied: future in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (0.18.2)\n",
      "Requirement already satisfied: joblib in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.1.0)\n",
      "Requirement already satisfied: spacy>=3.2.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (3.2.3)\n",
      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.4.0)\n",
      "Requirement already satisfied: setuptools in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (60.9.3)\n",
      "Requirement already satisfied: pathy>=0.3.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.6.1)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.7)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (1.8.2)\n",
      "Requirement already satisfied: jinja2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.3)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.27.1)\n",
      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.9.0)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.6)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.6)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.1)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.4.2)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.9)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.3.0)\n",
      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (8.0.15)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (4.63.1)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.7.7)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.6)\n",
      "Requirement already satisfied: packaging>=20.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (21.3)\n",
      "Requirement already satisfied: regex>=2021.8.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from nltk->pke==2.0.0) (2022.3.15)\n",
      "Requirement already satisfied: click in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from nltk->pke==2.0.0) (8.0.4)\n",
      "Requirement already satisfied: scikit-learn in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from sklearn->pke==2.0.0) (1.0.2)\n",
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from packaging>=20.0->spacy>=3.2.3->pke==2.0.0) (3.0.7)\n",
      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pathy>=0.3.5->spacy>=3.2.3->pke==2.0.0) (5.2.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy>=3.2.3->pke==2.0.0) (4.1.1)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2021.10.8)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2.0.12)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (3.3)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (1.26.9)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from jinja2->spacy>=3.2.3->pke==2.0.0) (2.1.1)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from scikit-learn->sklearn->pke==2.0.0) (3.1.0)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.1.1 is available.\n",
      "You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/pke/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mRequirement already satisfied: datasets in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (2.2.2)\n",
      "Requirement already satisfied: dill<0.3.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.3.4)\n",
      "Requirement already satisfied: fsspec[http]>=2021.05.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (2022.5.0)\n",
      "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.6.0)\n",
      "Requirement already satisfied: tqdm>=4.62.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (4.63.1)\n",
      "Requirement already satisfied: pyarrow>=6.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (8.0.0)\n",
      "Requirement already satisfied: numpy>=1.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (1.22.3)\n",
      "Requirement already satisfied: requests>=2.19.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (2.27.1)\n",
      "Requirement already satisfied: xxhash in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (3.0.0)\n",
      "Requirement already satisfied: packaging in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (21.3)\n",
      "Requirement already satisfied: responses<0.19 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.18.0)\n",
      "Requirement already satisfied: multiprocess in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.70.12.2)\n",
      "Requirement already satisfied: pandas in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (1.4.2)\n",
      "Requirement already satisfied: aiohttp in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (3.8.1)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: filelock in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.7.0)\n",
      "Requirement already satisfied: pyyaml in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.1.1)\n",
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from packaging->datasets) (3.0.7)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2.0.12)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.3)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.9)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2021.10.8)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.2)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.7.2)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (21.4.0)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.2)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.2.0)\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pandas->datasets) (2022.1)\n",
      "Requirement already satisfied: six>=1.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.1.1 is available.\n",
      "You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/pke/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mCollecting en-core-web-sm==3.2.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.9/13.9 MB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from en-core-web-sm==3.2.0) (3.2.3)\n",
      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.15)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.27.1)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
      "Requirement already satisfied: jinja2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.3)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.63.1)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.7)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.7)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.9)\n",
      "Requirement already satisfied: pathy>=0.3.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
      "Requirement already satisfied: packaging>=20.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.3)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.9.0)\n",
      "Requirement already satisfied: numpy>=1.15.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.22.3)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
      "Requirement already satisfied: setuptools in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (60.9.3)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.7)\n",
      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.1.1)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2021.10.8)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.9)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.12)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.1.1)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.1.1 is available.\n",
      "You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/pke/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0m\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_sm')\n"
     ]
    }
   ],
   "source": [
    "!pip install git+https://github.com/boudinfl/pke.git\n",
    "!pip install datasets\n",
    "!python -m spacy download en_core_web_sm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f2fbf76",
   "metadata": {},
   "source": [
    "## Preamble on keyphrase extraction datasets using 🤗 datasets\n",
    "\n",
    "For simplicity and ease of use, we rely on the `datasets` module from 🤗 huggingface to load and access sample documents from keyphrase extraction datasets. Please have a look at our organization page (https://huggingface.co/taln-ls2n) for more information on which datasets are available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e4f2083a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No config specified, defaulting to: inspec/raw\n",
      "Reusing dataset inspec (/Users/boudin-f/.cache/huggingface/datasets/taln-ls2n___inspec/raw/1.1.0/0ae146cabe770846946b3279b4c751efe0aca2dd68b3f24427d4624cd22bb20d)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2337d3ece8a54cea954415debf71c99b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# load the inspec dataset\n",
    "dataset = load_dataset('taln-ls2n/inspec')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a3bc6276",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c09274deb06c489ea3be405710a9d0b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1000 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import spacy\n",
    "from tqdm.notebook import tqdm\n",
    "from spacy.tokenizer import _get_regex_pattern\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "# Tokenization fix for in-word hyphens (e.g. 'non-linear' would be kept \n",
    "# as one token instead of default spacy behavior of 'non', '-', 'linear')\n",
    "# https://spacy.io/usage/linguistic-features#native-tokenizer-additions\n",
    "\n",
    "from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER\n",
    "from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS\n",
    "from spacy.util import compile_infix_regex\n",
    "\n",
    "# Modify tokenizer infix patterns\n",
    "infixes = (\n",
    "    LIST_ELLIPSES\n",
    "    + LIST_ICONS\n",
    "    + [\n",
    "        r\"(?<=[0-9])[+\\-\\*^](?=[0-9-])\",\n",
    "        r\"(?<=[{al}{q}])\\.(?=[{au}{q}])\".format(\n",
    "            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES\n",
    "        ),\n",
    "        r\"(?<=[{a}]),(?=[{a}])\".format(a=ALPHA),\n",
    "        # ✅ Commented out regex that splits on hyphens between letters:\n",
    "        # r\"(?<=[{a}])(?:{h})(?=[{a}])\".format(a=ALPHA, h=HYPHENS),\n",
    "        r\"(?<=[{a}0-9])[:<>=/](?=[{a}])\".format(a=ALPHA),\n",
    "    ]\n",
    ")\n",
    "\n",
    "infix_re = compile_infix_regex(infixes)\n",
    "nlp.tokenizer.infix_finditer = infix_re.finditer\n",
    "\n",
    "# populates a docs list with spacy doc objects\n",
    "docs = []\n",
    "for sample in tqdm(dataset['train']):\n",
    "    docs.append(nlp(sample[\"title\"]+\". \"+sample[\"abstract\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d0e159c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:root:1000 docs, memory used: 10.000099182128906 mb\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "from pke import compute_document_frequency\n",
    "from string import punctuation\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "\n",
    "# compute idf weights\n",
    "compute_document_frequency(\n",
    "    documents=docs,\n",
    "    output_file='inspec.df.gz',\n",
    "    language='en',              # language of the input files\n",
    "    normalization='stemming',   # use porter stemmer\n",
    "    stoplist=list(punctuation), # stoplist (punctuation marks)\n",
    "    n=5                         # compute n-grams up to 5-grams\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "686e7a45",
   "metadata": {},
   "source": [
    "# Loading DF counts and benchmarking TfIdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a7962769",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5ba75fac6c4f4a59a37131def3008a3c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pke import load_document_frequency_file\n",
    "\n",
    "df = load_document_frequency_file(input_file='inspec.df.gz')\n",
    "\n",
    "# populates a docs list with spacy doc objects\n",
    "docs = []\n",
    "for sample in tqdm(dataset['test']):\n",
    "    docs.append(nlp(sample[\"title\"]+\". \"+sample[\"abstract\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a823e40e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2d7c969e09b04c36a908f501c4d298ea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pke.unsupervised import TfIdf\n",
    "extractor = TfIdf()\n",
    "outputs = []\n",
    "for i, doc in enumerate(tqdm(docs)):\n",
    "    extractor.load_document(input=doc, language='en')\n",
    "    extractor.grammar_selection(grammar=\"NP: {<ADJ>*<NOUN|PROPN>+}\")\n",
    "    extractor.candidate_weighting(df=df)\n",
    "    outputs.append([u for u,v in extractor.get_n_best(n=5, stemming=True)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b332ad5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19d1b24a01774e458aceb0996b1d1a87",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from nltk.stem.snowball import SnowballStemmer as Stemmer\n",
    "\n",
    "# populates the references list with stemmed keyphrases\n",
    "references = []\n",
    "for sample in tqdm(dataset['test']):\n",
    "    sample_keyphrases = []\n",
    "    for keyphrase in sample[\"keyphrases\"]:\n",
    "        # tokenize keyphrase\n",
    "        tokens = [token.text for token in nlp(keyphrase)]\n",
    "        # normalize tokens using Porter's stemming\n",
    "        stems = [Stemmer('porter').stem(tok.lower()) for tok in tokens]\n",
    "        sample_keyphrases.append(\" \".join(stems))\n",
    "    references.append(sample_keyphrases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d676dcc1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "885d92eeac454aff9b7d62a5bf4b16d2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: TfIdf P@5: 0.392 R@5: 0.244 F@5: 0.283\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "def evaluate(top_N_keyphrases, references):\n",
    "    P = len(set(top_N_keyphrases) & set(references)) / len(top_N_keyphrases)\n",
    "    R = len(set(top_N_keyphrases) & set(references)) / len(references)\n",
    "    F = (2*P*R)/(P+R) if (P+R) > 0 else 0 \n",
    "    return (P, R, F)\n",
    "\n",
    "\n",
    "# compute the P, R, F scores for the model\n",
    "scores = []\n",
    "for i, output in enumerate(tqdm(outputs)):\n",
    "    scores.append(evaluate(output, references[i]))\n",
    "    \n",
    "# compute the average scores\n",
    "avg_scores = np.mean(scores, axis=0)\n",
    "    \n",
    "# print out the performance of the model\n",
    "print(\"Model: TfIdf P@5: {:.3f} R@5: {:.3f} F@5: {:.3f}\".format(avg_scores[0], avg_scores[1], avg_scores[2]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
